In [1]:
# -*- coding: utf-8 -*-
%matplotlib inline
import os
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from bald_latin import remove_cyrillic_and_accents as balden
In [2]:
# comments collected by the "Lovac na sendvice" app
def load_comments_and_labels():
lns_comments = balden(open('dataset/lns/lns_comments.txt', 'r').readlines())
lns_stemmed = open('dataset/lns/lns_comments_stemmed.txt', 'r').readlines()
lns_labels = open('dataset/lns/lns_labels.txt', 'r').readlines()
# remove cyrillic and accents on stemmed comments
lns_stemmed, lns_labels = balden(lns_stemmed, lns_labels)
# labels as a numpy array
lns_labels = np.array([int(float(x)) for x in lns_labels])
assert len(lns_comments) == len(lns_stemmed)
return lns_comments, lns_stemmed, lns_labels
def load_scraped():
# scraped comments from Blic.rs
scraped_comments, scraped_stemmed = balden(
open('dataset/scraped/comments.txt').readlines(),
open('dataset/scraped/comments_stemmed.txt').readlines())
assert len(scraped_comments) == len(scraped_stemmed)
return scraped_comments, scraped_stemmed
def load_scraped_not_category():
scraped_nots_comments, scraped_nots_stemmed = balden(
open('dataset/scraped/slobodno_vreme.txt').readlines(),
open('dataset/scraped/slobodno_vreme_stemmed.txt').readlines())
assert len(scraped_nots_comments) == len(scraped_nots_stemmed)
return scraped_nots_comments, scraped_nots_stemmed
lns_comments, lns_stemmed, lns_labels = load_comments_and_labels()
print "Loaded LnS comments and labels"
scraped_comments, scraped_stemmed = load_scraped()
print "Loaded scraped comments"
scraped_nots_comments, scraped_nots_stemmed = load_scraped_not_category()
print "Loaded scraped nots"
In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
def build_vectorizer():
croatian_stop_words = set([u"a",u"ako",u"ali",u"bi",u"bih",u"bila",u"bili",u"bilo",u"bio",u"bismo",u"biste",u"biti",u"bumo",u"da",u"do",u"duž",u"ga",u"hoće",u"hoćemo",u"hoćete",u"hoćeš",u"hoću",u"i",u"iako",u"ih",u"ili",u"iz",u"ja",u"je",u"jedna",u"jedne",u"jedno",u"jer",u"jesam",u"jesi",u"jesmo",u"jest",u"jeste",u"jesu",u"jim",u"joj",u"još",u"ju",u"kada",u"kako",u"kao",u"koja",u"koje",u"koji",u"kojima",u"koju",u"kroz",u"li",u"me",u"mene",u"meni",u"mi",u"mimo",u"moj",u"moja",u"moje",u"mu",u"na",u"nad",u"nakon",u"nam",u"nama",u"nas",u"naš",u"naša",u"naše",u"našeg",u"ne",u"nego",u"neka",u"neki",u"nekog",u"neku",u"nema",u"netko",u"neće",u"nećemo",u"nećete",u"nećeš",u"neću",u"nešto",u"ni",u"nije",u"nikoga",u"nikoje",u"nikoju",u"nisam",u"nisi",u"nismo",u"niste",u"nisu",u"njega",u"njegov",u"njegova",u"njegovo",u"njemu",u"njezin",u"njezina",u"njezino",u"njih",u"njihov",u"njihova",u"njihovo",u"njim",u"njima",u"njoj",u"nju",u"no",u"o",u"od",u"odmah",u"on",u"ona",u"oni",u"ono",u"ova",u"pa",u"pak",u"po",u"pod",u"pored",u"prije",u"s",u"sa",u"sam",u"samo",u"se",u"sebe",u"sebi",u"si",u"smo",u"ste",u"su",u"sve",u"svi",u"svog",u"svoj",u"svoja",u"svoje",u"svom",u"ta",u"tada",u"taj",u"tako",u"te",u"tebe",u"tebi",u"ti",u"to",u"toj",u"tome",u"tu",u"tvoj",u"tvoja",u"tvoje",u"u",u"uz",u"vam",u"vama",u"vas",u"vaš",u"vaša",u"vaše",u"već",u"vi",u"vrlo",u"za",u"zar",u"će",u"ćemo",u"ćete",u"ćeš",u"ću",u"što"])
# build a TF-IDF vectorizer over unigrams and bigrams;
# only n-grams appearing in at least 10 documents (min_df=10) are kept as features
vectorizer = TfidfVectorizer(
strip_accents="unicode",
lowercase=True,
ngram_range=(1, 2),
min_df=10,
norm='l2',
smooth_idf=True,
use_idf=True,
stop_words=croatian_stop_words)
return vectorizer
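# Quick sketch (illustration only, not part of the original pipeline): fit the
# vectorizer on the stemmed LnS comments loaded above and check how many
# unigram/bigram features survive the min_df cutoff. The leading underscore
# names are throwaway variables added for this example.
_tfidf = build_vectorizer()
print(_tfidf.fit_transform(lns_stemmed).shape)  # (n_comments, n_features)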
In [5]:
def classify_bots(text_train, y_train, unlabeled_stemmed, threshold=0.83):
"""
Train the classifier on text_train and y_train, then flag as bots the
unlabeled stemmed comments whose predicted bot probability exceeds the threshold.
Return the indices of the detected bot comments.
"""
# build the dataset, vectorize it using TF-IDF
vectorizer = build_vectorizer()
X_train = vectorizer.fit_transform(text_train)
X_unlabeled = vectorizer.transform(unlabeled_stemmed)
# create and fit the classifier
clf = MultinomialNB().fit(X_train, y_train)
# predict on the unlabeled set
y_pred = clf.predict_proba(X_unlabeled)[:,1]
# select comments whose bot probability exceeds the threshold
# (flatten so the indices can be used directly on a Python list)
bot_indices = np.argwhere(y_pred > threshold).flatten()
return bot_indices
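# Illustration of the thresholding step (made-up probabilities, not real model
# output): only indices whose predicted bot probability clears the threshold
# are kept as pseudo-labels for the bot class.
_example_probs = np.array([0.95, 0.10, 0.84, 0.50])
print(np.argwhere(_example_probs > 0.83).flatten())  # prints [0 2]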
def build_large_comment_set(lns_comments, lns_labels, scraped_bots, scraped_nots, weight=0.1):
"""
Build a dataset from the original unstemmed 'Lovac na Sendvice' comments,
the newly classified bot comments, and the manually tagged 'not' (non-bot) comments.
"""
combined_comments = lns_comments + scraped_bots + scraped_nots
combined_labels = list(lns_labels) + list(np.ones(len(scraped_bots))) + list(np.zeros(len(scraped_nots)))
combined_labels = np.array(combined_labels)
# the weights of the newly added samples are decreased
combined_weights = np.ones(len(combined_labels))
combined_weights[len(lns_comments):] = weight
return combined_comments, combined_labels, combined_weights
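# Toy illustration (made-up strings, not real comments): the original LnS labels
# are kept, NB-labeled bots get label 1, 'not' comments get label 0, and the two
# appended groups are down-weighted relative to the hand-labeled data.
_c, _y, _w = build_large_comment_set(["lns bot", "lns not"], np.array([1, 0]), ["scraped bot"], ["scraped not"])
print(_y)  # labels: 1, 0, 1, 0
print(_w)  # weights: 1.0, 1.0, 0.1, 0.1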
def comments2matrix(comments):
"""
Prepare the comments to be fed to the LSTM
"""
def remove_symbols(comments):
# replace characters, reduce set
bad_chars = ['\n', '\t', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
def clean_comment(comment):
comment = comment.encode('ascii', errors='ignore').lower()
for bc in bad_chars:
comment = comment.replace(bc, ' ')
return comment
return map(clean_comment, comments)
def pad_comments(comments, size=100):
"""
Pad each comment to *size* characters. Longer comments are truncated.
"""
def pad(comment):
comment = comment[:size]
comment = comment + " " * (size - len(comment))
return comment
return map(pad, comments)
def one_hot(comments):
#char_set = list(set("".join(comments)))
char_set = set([' ', '1', '0', '3', '2', '5', '4', '7', '6', '9', '8', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
char_set_str = "".join(char_set)
char_set_len = len(char_set)
# TODO optimize? should use sparse matrices
# Currently X is < 1GB, which is kinda ok
X = np.zeros((len(comments), len(comments[0]), char_set_len))
for comment_ind, comment in enumerate(comments):
for char_ind, char in enumerate(comment):
X[comment_ind, char_ind, char_set_str.find(char)] = 1
return X
# remove symbols, pad, and vectorize
return one_hot(pad_comments(remove_symbols(comments)))
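# Shape check (toy input, illustration only): every comment becomes a fixed
# 100-character window, one-hot encoded over the 37-character set defined above.
print(comments2matrix([u"Ovo je probni komentar!"]).shape)  # (1, 100, 37)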
def y_one_hot(y):
"""
Two categories require two output neurons, so y must be converted to a one-hot representation.
"""
one_hot = np.zeros((len(y), 2))
one_hot[np.arange(len(y)), np.round(y).astype(int)] = 1
return one_hot
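# Toy check (illustration only): label 1 maps to [0, 1] and label 0 maps to [1, 0].
print(y_one_hot(np.array([1, 0])))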
In [6]:
def build_training_and_test_set(lns_comments, lns_stemmed, lns_labels, unlabeled_comments, unlabeled_stemmed, scraped_nots):
ratio = 0.8
# hold out a random subset of the LnS comments as the test set
#train = range(int(len(lns_comments) * ratio))
#test = range(int(len(lns_comments) * ratio), len(lns_comments))
from sklearn.cross_validation import train_test_split
train, test = train_test_split(range(len(lns_comments)), train_size=ratio)
text_train_stemmed = [lns_stemmed[x] for x in train]
text_train_comments = [lns_comments[x] for x in train]
text_test_stemmed = [lns_stemmed[x] for x in test]
text_test_comments = [lns_comments[x] for x in test]
y_train = [lns_labels[x] for x in train]
y_test = [lns_labels[x] for x in test]
print("Built the training dataset")
bot_indices = classify_bots(text_train_stemmed, y_train, unlabeled_stemmed)
print("Classified bots using NB")
# using the NB predictions, pull the bots from the original comments
classified_bots = [unlabeled_comments[x] for x in bot_indices]
# and take the same number of comments from the not categories
classified_nots = scraped_nots[:len(classified_bots)]
# build a large dataset from LnS comments and NB-labeled comments; NB comments get lower weights
comments_large, y_large, weights_large = build_large_comment_set(
text_train_comments,
y_train,
classified_bots,
classified_nots)
# shuffle the combined comments and labels together
from sklearn.utils import shuffle
comments_large, y_large = shuffle(comments_large, y_large)
y_large = y_one_hot(y_large)
y_test = y_one_hot(y_test)
print("Built the training set for the LSTM")
X_large = comments2matrix(comments_large)
X_test = comments2matrix(text_test_comments)
print("Vectorized the training set")
return X_large, y_large, X_test, y_test
max_scraped = 3*10**6  # cap the number of scraped comments to keep the run time manageable
unlabeled_comments = scraped_comments[:max_scraped]
unlabeled_stemmed = scraped_stemmed[:max_scraped]
X_train, y_train, X_test, y_test = build_training_and_test_set(lns_comments, lns_stemmed, lns_labels, unlabeled_comments, unlabeled_stemmed, scraped_nots_comments)
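# Sanity check (assumes the cell above ran to completion): the LSTM inputs are
# (samples, 100 characters, 37 one-hot channels) and the targets are one-hot pairs.
print(X_train.shape)  # (n_train_large, 100, 37)
print(y_train.shape)  # (n_train_large, 2)
print(X_test.shape)   # (n_test, 100, 37)
print(y_test.shape)   # (n_test, 2)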
In [ ]:
def build_net(shape, nb_filter=64, pool_length=2, lstm_output_size=128):
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Input, merge, Activation,Convolution1D, MaxPooling1D, Flatten, Convolution2D
model = Sequential()
model.add(Convolution1D(input_shape=(shape[1], shape[2]),
nb_filter=16, filter_length=11, border_mode='valid', activation='relu', subsample_length=1))
model.add(Convolution1D(nb_filter=16, filter_length=11, border_mode='valid', activation='relu', subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
model.add(Convolution1D(nb_filter=32, filter_length=9, border_mode='valid', activation='relu', subsample_length=1))
model.add(Convolution1D(nb_filter=32, filter_length=9, border_mode='valid', activation='relu', subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
model.add(Convolution1D(nb_filter=64, filter_length=7, border_mode='valid', activation='relu', subsample_length=1))
model.add(Convolution1D(nb_filter=64, filter_length=7, border_mode='valid', activation='relu', subsample_length=1))
model.add(MaxPooling1D(pool_length=pool_length))
#model.add(Convolution1D(nb_filter=16, filter_length=5, border_mode='valid', activation='relu', subsample_length=1))
#model.add(Convolution1D(nb_filter=16, filter_length=5, border_mode='valid', activation='relu', subsample_length=1))
#model.add(MaxPooling1D(pool_length=pool_length))
#model.add(Convolution1D(nb_filter=16, filter_length=3, border_mode='valid', activation='relu', subsample_length=1))
#model.add(MaxPooling1D(pool_length=pool_length))
#model.add(Flatten())
#model.add(LSTM(lstm_output_size, return_sequences=True))
#model.add(LSTM(64, input_shape=(shape[1], shape[0]), dropout_U=0.5, dropout_W=0.3, return_sequences=True))
model.add(LSTM(64, dropout_U=0.5, dropout_W=0.3, return_sequences=True))
#model.add(GRU(64, return_sequences=True, activation='softsign', input_shape=(shape[1], shape[2])))
#model.add(Flatten())
#model.add(Dropout(0.2))
#model.add(GRU(10, return_sequences=False, activation='softsign'))
model.add(LSTM(64, dropout_U=0.3, dropout_W=0.3, return_sequences=False))
#model.add(GRU(32, return_sequences=False))
model.add(Dense(20))
model.add(Dense(2))
# softmax over the two output classes
model.add(Activation('softmax'))
# try using different optimizers and different optimizer configs
from keras.optimizers import SGD
#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
#model.compile(sgd, 'binary_crossentropy', metrics=['accuracy'])
# categorical cross-entropy matches the two-unit softmax output
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
return model
net = build_net(X_train.shape)
print("Built the net")
epoch = 100
batch_size = 64
max_samples = 20000
# train one epoch at a time so validation metrics are reported after every pass
for ep in range(epoch):
net.fit(X_train[:max_samples], y_train[:max_samples],
batch_size=batch_size,
nb_epoch=1,
validation_data=[X_test, y_test])
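# Evaluation sketch (assumes the training loop above has finished): convert the
# softmax outputs and one-hot targets back to class indices and reuse the sklearn
# metrics imported in the first cell.
y_test_true = np.argmax(y_test, axis=1)
y_test_pred = np.argmax(net.predict(X_test, batch_size=batch_size), axis=1)
print("Accuracy:  %.3f" % accuracy_score(y_test_true, y_test_pred))
print("Precision: %.3f" % precision_score(y_test_true, y_test_pred))
print("Recall:    %.3f" % recall_score(y_test_true, y_test_pred))
print(confusion_matrix(y_test_true, y_test_pred))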
In [ ]: